clear
clear matrix
set more off

* ------------------------------------------------------------------------------
* set paths
global root "[include file path here]"
global input "$root/Input"
global output "$root/Output"
global intermediate "$root/Intermediate"
global figures "$root/Figures"

* ______________________________________________________________________________
* create matched sample
use "$input/eventStudyPanel_raw.dta", clear
keep if conversions == 0 | (conversions == 1 & wg == "SF")

tab balanced
keep if balanced == 1 // consider only the balanced sample for matching
tab year

preserve
duplicates drop pid, force
tab conversions // about 11 % of my parcels are enrollees
tab event_year
restore

* drop variables that are no longer informative
foreach var of varlist earlyx balanced wg {
	tab `var', missing
	}
tab conversions wg, missing

drop earlyx balanced wg yearmonth
order pid year month wuse conversions tau enroll_date event_year event_month conv_sqft tot_rebated program

* keep only months you will match on
keep if month==7
sort pid year month
save "$intermediate/temp1.dta", replace

* --------------------------------------------------------------------
* --------------------------------------------------------------------
set more off
foreach enrollyear of numlist 1998/2014 {
use "$intermediate/temp1.dta", clear
* define indicator for months prior to enrollment date
local year2 = `enrollyear' - 2
local year3 = `enrollyear' - 3
local year4 = `enrollyear' - 4
local year5 = `enrollyear' - 5

display "sanity check: enroll year is `enrollyear' so min year is `year5' and max year is `year2'" 
gen preenroll = 0
replace preenroll = 1 if inrange(year,`year5',`year2') & conversions == 1 & event_year == `enrollyear'
	
* keep all non-enrollee observations and all preenroll period enrollee observations
tab conversions preenroll
keep if conversions == 0 | preenroll==1
tab conversions preenroll

* RESHAPE data, keeping only the preenroll enrollees' and non-enrollees' july water use
tab year conversions
keep if inrange(year,`year5',`year2') 
tostring year month, gen(yr mo)
gen date = "y" + yr + "m" + mo
drop year yr month mo

keep wuse pid conversions date lotsize
reshape wide wuse, i(pid conversions lotsize) j(date) string
duplicates report pid

rename wuse* *
rename _all, lower
duplicates report pid

* generate matched sample (teffects nnmatch)
la var conversions "1 if parcel enrolled in Cash-for-Grass"
la define conversions_defn 0 "Non-participant" 1 "Participant"
la values conversions conversions_defn
gen dummy_ovar = runiform() // generate a dummy outcome variable for teffects nnmatch
local cntrlvars "y`year5'm7 y`year4'm7 y`year3'm7 y`year2'm7"

if `enrollyear' == 1998 { // 1998 runs into collinearity problems if I match on all available years
	display "enroll year = `enrollyear': I am here but shouldn't be"
	teffects nnmatch (dummy_ovar y1995m7 y1996m7 lotsize) (conversions), vce(iid) biasadj(y1995m7 y1996m7 lotsize) generate(match) osample(unmatched)
	}
else if `enrollyear' > 1998 {
	display "enroll year = `enrollyear': I am here and should be"
	teffects nnmatch (dummy_ovar `cntrlvars' lotsize) (conversions), vce(iid) biasadj(`cntrlvars' lotsize) generate(match) osample(unmatched)
	}
tab unmatched
drop unmatched
tab conversions

gen obs_id = _n
sort pid
save "$intermediate/temp_temp.dta", replace

* create a list of parcels and respective observation _n's
keep pid obs_id
rename (pid obs_id) (matchparcel match1)
duplicates report matchparcel
duplicates report match1
sort match1
save "$intermediate/temp_parcelList.dta", replace

* merge parcel-obsNumber relationship back into post-teffects data
use "$intermediate/temp_temp.dta", clear
sort match1
merge m:1 match1 using "$intermediate/temp_parcelList.dta"
tab conversions _merge, missing 
// the unmatched from using must be non-participants that did not match

keep pid conversions match1 obs_id matchparcel _merge
keep if conversions == 1
tab _merge
list if _merge !=3 // un-matched participating parcels
drop if _merge !=3
la var matchparcel "matched non-participating parcel"
keep pid matchparcel

duplicates report pid
duplicates report matchparcel

sort pid
la da "list of enrollees and their corresponding matched non-enrollees for `enrollyear'"
save "$intermediate/matchedparcels_`enrollyear'.dta", replace
rm "$intermediate/temp_parcelList.dta"
rm "$intermediate/temp_temp.dta"


* generate list of parcels to keep for new consumption file
use "$intermediate/matchedparcels_`enrollyear'.dta", clear
keep pid
duplicates report pid
gen enroll = 1
sort pid
save "$intermediate/temp_enroll1.dta", replace

use "$intermediate/matchedparcels_`enrollyear'.dta", clear
keep matchparcel
duplicates report matchparcel
rename matchparcel pid
gen enroll = 0
sort pid
save "$intermediate/temp_enroll0.dta", replace

append using "$intermediate/temp_enroll1.dta"
gen matchid_`enrollyear' = 1
la var matchid_`enrollyear' "indicator for parcel matching for the `enrollyear' event year sub-sample"

duplicates report pid
la data "list of parcels used for matched sample for year `enrollyear'"
sort pid
save "$intermediate/matchedsample_parcels_`enrollyear'.dta", replace
rm "$intermediate/temp_enroll1.dta"
rm "$intermediate/temp_enroll0.dta"
}
* append parcels and their non-participating matches
use "$intermediate/matchedparcels_1998.dta", clear
foreach year of numlist 1999/2014 {
	append using "$intermediate/matchedparcels_`year'.dta"
	}
duplicates report pid
la data "particpating parcels and their corresponding non-participating matches"
sort pid
save "$intermediate/matchedparcels.dta", replace

* append series of annual matched samples
use "$intermediate/matchedsample_parcels_1998.dta", clear
foreach year of numlist 1999/2014 {
	append using "$intermediate/matchedsample_parcels_`year'.dta"
	}
duplicates report pid if enroll==1
duplicates report pid if enroll==0
tab enroll

duplicates tag pid, gen(parcelfreq) // variable for parcel weights
replace parcelfreq = parcelfreq + 1
duplicates drop pid, force

foreach sub of numlist 1998/2014 {
	replace matchid_`sub' = 0 if matchid_`sub' == .
}
* save matched sample list
la data "list of parcels used for matched sample"
sort pid
save "$intermediate/matchedsample_parcels.dta", replace

* ______________________________________________________________________________
* merge matched sample with raw enrollment file
use "$input/eventStudyPanel_raw.dta", clear
merge m:1 pid using "$intermediate/matchedsample_parcels.dta"
keep if _merge==3
drop _merge

* some sanity checks
tab conversions enroll, missing
tab earlyx, missing
tab balanced, missing
tab wg, missing
drop wg earlyx balanced

sort pid year month
la data "matched sample: matches on lot size and pre-conversion July consumption (2-5 yrs out)"
save "$output/eventStudyPanel_matched.dta", replace

* data file for mean water use plots
use "$output/eventStudyPanel_matched.dta", clear
collapse (mean) wuse (count) pid=wuse, by(year month enroll2) fast
sort enroll2 year month
save "$intermediate/matchpanel_meanwuse.dta", replace

* data file for participation plots
use "$output/eventStudyPanel_matched.dta", clear
duplicates drop pid, force
	
tabstat tot_rebated conv_sqft, stat(n mean sd min max sum) col(statistics) f(%9.0gc)
tabstat tot_rebated, stat(n mean sd min max sum) by(event_year) columns(statistics) f(%9.0gc)
tabstat conv_sqft, stat(n mean sd min max) by(event_year) columns(statistics) f(%9.0gc)
	
collapse (count) nconv = conv_sqft nrebate = tot_rebated (mean) meanconv = conv_sqft  ///
(sd) sdconv = conv_sqft (semean) seconv = conv_sqft (min) minconv=conv_sqft (max) maxconv = conv_sqft, by(event_year)
sort event_year
save "$intermediate/matchpanel_conversions.dta", replace

* create data files for mean water use and participation in each event year of the matched sample
foreach yr of numlist 1998/2014 {
	use "$output/eventStudyPanel_matched.dta", clear 
	// data file for mean water use plots for each individual year
	keep if matchid_`yr' == 1
	collapse (mean) wuse (count) pid=wuse, by(year month enroll2) fast
	sort enroll2 year month
	save "$intermediate/matchpanel_meanwuse_`yr'.dta", replace
		
	use "$output/eventStudyPanel_matched.dta", clear 
	// data file for participation plots for each individual year
	keep if matchid_`yr' == 1
	duplicates drop pid, force
	tabstat tot_rebated conv_sqft, stat(n mean sd min max sum) col(statistics) f(%9.0gc)
	collapse (count) nconv = conv_sqft nrebate = tot_rebated (mean) meanconv = conv_sqft  ///
	(sd) sdconv = conv_sqft (semean) seconv = conv_sqft (min) minconv=conv_sqft (max) maxconv = conv_sqft, by(event_year)
	sort event_year
	save "$intermediate/matchpanel_conversions_`yr'.dta", replace
	}

* merge application information into panel
sort pid year month
merge m:1 pid using "$input/applications.dta"
sort pid year month
drop if _merge == 2
tab _merge conversions
rename _merge _mergeApp

* generate pre-period indicator
gen pre = 0
replace pre = 1 if ((year == year(appdate) & month >= month(appdate)) | ///
(year > year(appdate))) & tau < 0

* generate zero consumption indicator
by pid: egen minwuse = min(wuse)
gen zero = 0
replace zero = 1 if minwuse == 0
la var zero "1 if parcel contains zero water use anywhere"

* update matched panel
compress
sort pid year month
save "$output/eventStudyPanel_matched.dta", replace